b'\n\n
\n \n \n\n
\n
\n \n OFFICIAL SENSITIVE\n \n
\n\n \n \n
\n\n This template demonstrates how to develop and share interactive, reproducible analysis using Python and JupyterLab.\n
\n\n The data used in this template is for learning purposes only.\n
\n\n \n \n
\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport hvplot.pandas\nimport datetime\nimport re\nimport locale\nfrom locale import atof\n\n \n The data sources used are fictional exam scores from Kaggle and a dataset from ONS on 18-24 student population over time. The datasets can be found at\n \n https://www.kaggle.com/spscientist/students-performance-in-exams\n \n and\n \n https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/articles/howhasthestudentpopulationchanged/2016-09-20\n \n
\nexam_performance = pd.read_csv("data/students-performance.csv")\nstudents = pd.read_csv("data/total-students.csv")\n\n exam_performance.head(3)\n\n | \n | \n\n gender\n | \n\n race/ethnicity\n | \n\n parental level of education\n | \n\n lunch\n | \n\n test preparation course\n | \n\n math score\n | \n\n reading score\n | \n\n writing score\n | \n
|---|---|---|---|---|---|---|---|---|
| \n 0\n | \n\n female\n | \n\n group B\n | \n\n some college\n | \n\n free/reduced\n | \n\n completed\n | \n\n 71\n | \n\n 85\n | \n\n 83\n | \n
| \n 1\n | \n\n male\n | \n\n group B\n | \n\n some high school\n | \n\n standard\n | \n\n none\n | \n\n 41\n | \n\n 40\n | \n\n 34\n | \n
| \n 2\n | \n\n male\n | \n\n group C\n | \n\n high school\n | \n\n standard\n | \n\n none\n | \n\n 66\n | \n\n 52\n | \n\n 54\n | \n
\n The exam performance data comprises 5,000 records (rows) and 8 features (columns).\n
\nexam_performance.info()\n\n <class \'pandas.core.frame.DataFrame\'>\nRangeIndex: 5000 entries, 0 to 4999\nData columns (total 8 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 gender 5000 non-null object\n 1 race/ethnicity 5000 non-null object\n 2 parental level of education 5000 non-null object\n 3 lunch 5000 non-null object\n 4 test preparation course 5000 non-null object\n 5 math score 5000 non-null int64 \n 6 reading score 5000 non-null int64 \n 7 writing score 5000 non-null int64 \ndtypes: int64(3), object(5)\nmemory usage: 312.6+ KB\n\n
students.head(3)\n\n | \n | \n\n date\n | \n\n aged 18 to 24 in full-time education (thousands)\n | \n
|---|---|---|
| \n 0\n | \n\n Mar-May 1992\n | \n\n 984\n | \n
| \n 1\n | \n\n Apr-Jun 1992\n | \n\n 999\n | \n
| \n 2\n | \n\n May-Jul 1992\n | \n\n 1,012\n | \n
students.info()\n\n <class \'pandas.core.frame.DataFrame\'>\nRangeIndex: 291 entries, 0 to 290\nData columns (total 2 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 date 291 non-null object\n 1 aged 18 to 24 in full-time education (thousands) 291 non-null object\ndtypes: object(2)\nmemory usage: 4.7+ KB\n\n
\n Transform the 18 to 24 student population from string 1,000 format to decimal 1000.00 using ASCII to float (atof).\n
# Convert the student counts from thousands-separated strings (e.g. "1,012")
# to floats, then derive the absolute head-count column.
#
# The process locale is still set as before, but the parsing below no longer
# relies on it: locale.atof() only strips "," when the machine's locale uses it
# as the grouping character, so the result (or a ValueError) used to depend on
# where the notebook was run.
locale.setlocale(locale.LC_NUMERIC, '')

students["aged 18 to 24 in full-time education (thousands)"] = (
    students["aged 18 to 24 in full-time education (thousands)"]
    .astype(str)  # lets the cell be re-run after the column is already numeric
    .str.replace(",", "", regex=False)
    .astype(float)
)

# The source column is expressed in thousands; scale to individual students.
students["aged 18 to 24 in full-time education"] = (
    students["aged 18 to 24 in full-time education (thousands)"] * 1000
)

# Add year and month columns to 18-24 student population data.
def add_year_column_to_students():
    """Add an integer ``year`` column to the global ``students`` frame.

    The year is the first run of four digits in each ``date`` label,
    e.g. ``"Mar-May 1992"`` -> ``1992``.

    Raises:
        ValueError: if any ``date`` label contains no four-digit year
            (the missing match cannot be cast to ``int``).
    """
    years = students["date"].str.extract(r"(\d{4})", expand=False)
    students["year"] = years.astype(int)


add_year_column_to_students()


def add_month_column_to_students():
    """Add a ``month`` column holding each row's period label.

    The label is the text before the first space in ``date``,
    e.g. ``"Mar-May 1992"`` -> ``"Mar-May"``.
    """
    # ``.str[0]`` takes the first token of *every* row. Plain ``[0][0]`` would
    # index row 0 only and broadcast that single label to the whole column.
    students["month"] = students["date"].str.split(" ").str[0]


add_month_column_to_students()
\n\n \n \n
# Interactive chart: yearly mean of the student count (in thousands), drawn as
# a line with the individual yearly points overlaid.
#
# Only the numeric column is aggregated. The frame also carries the string
# columns "date" and "month", and averaging the whole frame raises a TypeError
# on pandas >= 2.0. The aggregation is computed once and reused by both layers.
yearly_mean = students.groupby("year")[
    ["aged 18 to 24 in full-time education (thousands)"]
].mean()

yearly_mean.hvplot(x='year', y='aged 18 to 24 in full-time education (thousands)') \
* yearly_mean.hvplot.scatter(x='year', y='aged 18 to 24 in full-time education (thousands)')

# Static chart: seaborn aggregates the rows of each year itself, so it is fed
# the un-aggregated frame.
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(
    x="year",
    y="aged 18 to 24 in full-time education (thousands)",
    data=students,
)
plt.xticks(rotation=15)
plt.title(
    "Young people aged 18 to 24 in full-time education, seasonally adjusted, UK,"
    " March to May 1992 to May to July 2016"
)
plt.show()

# Student numbers (aged 18-24) have almost doubled since 1992. In 2016, the
# student population was around 1,896,000.
\nstudents[["year", "aged 18 to 24 in full-time education"]] \\\n .groupby("year") \\\n .mean() \\\n .astype(int) \\\n .style.format({\n "aged 18 to 24 in full-time education": "{:,d}"\n })\n\n | \n | \n\n aged 18 to 24 in full-time education\n | \n
|---|---|
| \n year\n | \n\n | \n
| \n 1992\n | \n\n 1,034,500\n | \n
| \n 1993\n | \n\n 1,043,750\n | \n
| \n 1994\n | \n\n 1,117,750\n | \n
| \n 1995\n | \n\n 1,141,416\n | \n
| \n 1996\n | \n\n 1,133,166\n | \n
| \n 1997\n | \n\n 1,169,416\n | \n
| \n 1998\n | \n\n 1,231,416\n | \n
| \n 1999\n | \n\n 1,259,916\n | \n
| \n 2000\n | \n\n 1,257,750\n | \n
| \n 2001\n | \n\n 1,293,500\n | \n
| \n 2002\n | \n\n 1,301,333\n | \n
| \n 2003\n | \n\n 1,396,916\n | \n
| \n 2004\n | \n\n 1,420,666\n | \n
| \n 2005\n | \n\n 1,469,416\n | \n
| \n 2006\n | \n\n 1,487,916\n | \n
| \n 2007\n | \n\n 1,527,833\n | \n
| \n 2008\n | \n\n 1,609,250\n | \n
| \n 2009\n | \n\n 1,706,916\n | \n
| \n 2010\n | \n\n 1,825,500\n | \n
| \n 2011\n | \n\n 1,849,583\n | \n
| \n 2012\n | \n\n 1,880,666\n | \n
| \n 2013\n | \n\n 1,877,083\n | \n
| \n 2014\n | \n\n 1,872,833\n | \n
| \n 2015\n | \n\n 1,887,583\n | \n
| \n 2016\n | \n\n 1,896,571\n | \n